library(recommenderlab)
library(ggplot2)

# 2.1
# Read the ratings table from disk and coerce it to a recommenderlab
# realRatingMatrix.
# NOTE(review): the coercion presumably expects user / item / rating columns
# in that order -- confirm against the CSV layout.
temp <- read.csv("data/MovieLense.csv")
# MovieLenseMeta <- read.csv("MovieLenseMeta.csv")

# temp[1:271, ]
MovieLense <- as(temp, "realRatingMatrix")

# Data-frame view of the rating matrix; print its dimensions and the
# dimensions of the rows belonging to user 1.
DF <- getData.frame(MovieLense)
dim(DF)
dim(subset(DF, user == 1))

# Print the names of the methods defined for the class of MovieLense as a
# three-column table (coercion methods are left out).
n_table_cols <- 3L
methods_matrix <- methods(class = class(MovieLense))
methods_to_print <- as.character(methods_matrix)
methods_to_print <- methods_to_print[!grepl("coerce", methods_to_print)]
# Keep only the generic's name: drop everything from the first comma on.
methods_to_print <- gsub(",.*", "", methods_to_print, perl = TRUE)
# Pad with empty strings up to the next multiple of the column count.
# A fixed padding of two cells is only right when the method count is
# 1 modulo 3; otherwise matrix() warns and recycles method names.
n_padding <- (-length(methods_to_print)) %% n_table_cols
methods_to_print <- c(methods_to_print, rep("", n_padding))
pander::pander(matrix(methods_to_print, ncol = n_table_cols))

# Usually, rating matrices are sparse matrices. 
# For this reason, the realRatingMatrix class supports a compact storage of sparse matrices. 
# Let's compare the size of MovieLense with the corresponding R matrix:
# Memory used by the realRatingMatrix object.
size_sparse <- object.size(MovieLense)
size_sparse

# Memory used by the same ratings converted to a standard R matrix.
size_dense <- object.size(as(MovieLense, "matrix"))
size_dense

# How many times larger the standard matrix is than the recommenderlab one:
size_dense / size_sparse

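# As expected, MovieLense occupies much less space than the equivalent standard
# R matrix.
# The size ratio is about 1:9, and the reason is the sparsity of MovieLense.
# A standard R matrix stores a value for every cell, including the missing
# ratings, so it holds about 15 times more cells than there are ratings.
# (Presumably the size ratio is smaller than 15 because the sparse format also
# stores index information alongside each rating.)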

# Exercise 1: Download ml-latest-small.zip from the newly released MovieLens 
# for education. Compile a dataset as the one, MovieLense.csv, used by code. 


#===== Computing the similarity matrix =====#
# Collaborative filtering algorithms are based on measuring the similarity
# between "users" or between "items." 
# Similarity functions: "cosine","pearson","jaccard".
# Compute the similarities among the first five items and among the first five users.

# Cosine similarity between the first five items (columns of MovieLense).
similarity_items <- similarity(
  MovieLense[, 1:5],
  method = "cosine",
  which = "items"
)

# Cosine similarity between the first five users (rows of MovieLense).
similarity_users <- similarity(
  MovieLense[1:5, ],
  method = "cosine",
  which = "users"
)

class(similarity_items)

# We then convert similarity_users into a matrix and visualize it.
# Convert the user similarities into a plain matrix, print it, and draw it.
similarity_users_as_matrix <- as.matrix(similarity_users)
similarity_users_as_matrix

# The more red a cell is, the more similar the two users are.
# Note that the diagonal is red, since it compares each user with itself.
dev.new()
image(similarity_users_as_matrix, main = "User similarity") # Fig 2.1-3

# Same approach for the similarity between the first five items.
similarity_items_as_matrix <- as.matrix(similarity_items)
dev.new()
image(similarity_items_as_matrix, main = "Item similarity")

# Using image, we can visualize the matrix. Each row and each column corresponds to
# an item, and each cell corresponds to the similarity between two items:
# The more red the cell is, the more similar two items are. Note that the diagonal is red,
# since it's comparing each item with itself.

## Check Recommendation Models with respect to data type
## Recommendation models registered for realRatingMatrix data
recommender_models <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
names(recommender_models)

# One-column table of the model names.
df_models <- data.frame(model = names(recommender_models))

## Print the description stored in each registry entry
lapply(recommender_models, function(entry) entry[["description"]])
# Out of them, we will use IBCF and UBCF.

# Parameters of the IBCF model, printed and then tabulated as
# parameter name / default value.
ibcf_parameters <- recommender_models$IBCF_realRatingMatrix$parameters
ibcf_parameters


df_parameters <- data.frame(
  parameter = names(ibcf_parameters),
  default = unlist(ibcf_parameters)
)
rownames(df_parameters) <- NULL

## ========================= ##
## Exploratory Data Analysis ##
## ========================= ##

#########################################
## 1. Exploring the nature of the data ##
#########################################
# Let's take a quick look at MovieLense. As explained in the previous section,
# there are some generic methods that can be applied to realRatingMatrix
# objects. We can extract their size using dim:
# Size of the rating matrix.
dim(MovieLense)
# There are 943 users and 1664 movies. realRatingMatrix is an S4 class, so
# its components live in slots; slotNames lists them:
slotNames(MovieLense)
# Inspect the class and dimensions of the data slot.
class(slot(MovieLense, "data"))
dim(slot(MovieLense, "data"))

# MovieLense@data belongs to the dgCMatrix class that inherits from Matrix. 
# In order to perform custom data exploration, we might need to access this slot.  

###########################################
## 2. Exploring the values of the rating ##
###########################################

# Starting from the slot data, we can explore the matrix. 
# Let's take a look at the ratings.
# We can convert the matrix into a vector and explore its values:
# Flatten the data slot into one vector and list the distinct values.
vector_ratings <- as.vector(MovieLense@data)
unique(vector_ratings)

# The ratings are integers in the range 0-5.
# Count the occurrences of each value (printed twice, in two forms).
table_ratings <- table(vector_ratings)
summary(factor(vector_ratings))
table_ratings

# The same counts as a rating / occurrences table.
df_ratings <- data.frame(
  rating = names(table_ratings),
  occurrences = as.vector(table_ratings)
)

# According to the documentation, a rating equal to 0 represents a missing
# value, so those entries are dropped from vector_ratings.
vector_ratings <- vector_ratings[vector_ratings != 0]

# Now, we can build a frequency plot of the ratings with ggplot2.
# The rating column is left numeric; geom_bar() counts the rows found at each
# distinct rating value.

# seq_along() is used for the ID column so that an empty vector_ratings gives
# an empty data frame; seq(length(x)) would produce c(1, 0) and make
# data.frame() fail on mismatched column lengths.
vector_ratings_DF <- data.frame(
  ID = seq_along(vector_ratings),
  rating = vector_ratings
)

# Title reads "Distribution of rating values"; x label reads "Rating".
dev.new()
ggplot(vector_ratings_DF, aes(x = rating)) +
  geom_bar() +
  ggtitle("評分數字分佈") +
  xlab("評分") # Fig. 2.1-4


# Most of the ratings are above 2, and the most common is 4.

################################################
## 3. Exploring which movies have been viewed ##
################################################

# Starting with MovieLense, we can easily extract quick results using methods
# such as the following methods(functions):
  # colCounts: This is the number of non-missing values for each column
  # colMeans: This is the average value for each column

# For instance, which are the most viewed movies? 
# We can use colCounts for this purpose. 
# First, let's count the views for each movie:

# Number of ratings recorded for each movie (column).
views.per.movie <- colCounts(MovieLense)
head(as.data.frame(views.per.movie))

## Movie / views table, sorted from most to least viewed
table.views <- data.frame(
  movie = names(views.per.movie),
  views = views.per.movie
)
views_order <- order(table.views$views, decreasing = TRUE)
table.views <- table.views[views_order, ]

# Now, we can visualize the first ten rows as a bar chart.
# The following image shows the number of views of the top movies:

# Bar chart of the first ten rows of table.views; movies appear in the
# default (alphabetical) order along the flipped axis.
top_ten_views <- table.views[1:10, ]
dev.new()
ggplot(top_ten_views, aes(x = movie, y = views)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Number of views of the top movies") +
  coord_flip() # Fig. 2.1-6

# In the preceding chart, you can notice that Star Wars (1977) 
# is the most viewed movie, exceeding the others by about 100 views.

# Exercise 2. Please sort the x-axis by views, rather than by alphabet.
# Same chart, with the movie axis ordered by number of views via reorder().
dev.new()
ggplot(
  table.views[1:10, ],
  aes(x = reorder(movie, -views), y = views)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Number of views of the top movies") +
  coord_flip() # Fig. 2.1-7

######################################
## 4. Exploring the average ratings ##
######################################

# We can identify the top-rated movies by computing the average rating of each of
# them. For this purpose, we can use colMeans; it automatically ignores the 0s, 
# since they represent missing values. 
# Let's take a look at the distribution of the average movie rating:

# Average rating of every movie, held in a one-column data frame.
avg.ratings <- data.frame(avg.ratings = colMeans(MovieLense))
head(avg.ratings)


# Histogram of the per-movie averages, using bins of width 0.1.
dev.new()
ggplot(avg.ratings, aes(x = avg.ratings)) +
  stat_bin(binwidth = 0.1) +
  ggtitle("Distribution of the average movie rating") +
  xlab("Average Ratings") # Fig. 2.1-8

# The highest value is around 3, and there are a few movies whose rating is 
# either 1 or 5. Probably, the reason is that these movies received a 
# rating from a few people only, so we shouldn't take them into account. 
# We can remove the movies whose number of views is below a defined threshold,
# for instance, those with 80 views or fewer:

# Distribution of the view counts over ten equal-width intervals.
table(cut(views.per.movie, 10))

# Keep only the averages of movies with more than 80 views.
has_enough_views <- views.per.movie > 80
avg.ratings2 <- data.frame(
  avg.ratings_relevant = avg.ratings[has_enough_views, ]
)
head(avg.ratings2)

# Let's re-build the chart:

dev.new()
ggplot(avg.ratings2, aes(x = avg.ratings_relevant)) +
  stat_bin(binwidth = 0.1) +
  ggtitle("Distribution of the relevant average ratings") +
  xlab("Average Ratings for views.per.movie > 80") # Fig. 2.1-9

# All the rankings are between 2.3 and 4.5. As expected, we removed the extremes. The
# highest value changes, and now, it is around 4.

###############################
## 5. Visualizing the matrix ##
###############################

# We can visualize the matrix by building a heat map whose colors represent the
# ratings. 
# Each row of the matrix corresponds to a user, each column to a movie, and
# each cell to its rating. 
# For this purpose, we can use the generic method "image". 
# The recommenderlab package redefined the method image for realRatingMatrix objects.
# Let's build the heatmap using image:

# Heat map of the whole rating matrix.
dev.new()
image(MovieLense, main = "Heatmap of the rating matrix") # Fig. 2.1-10

# We can notice a white area in the top-right region.
# The reason is that the rows and columns are sorted.
# With this many users and items the chart is hard to read, so we zoom in
# on the first 15 rows and 25 columns and draw the heat map again:

dev.new()
image(
  MovieLense[1:15, 1:25],
  main = "Heatmap of the first rows and columns"
) # Fig. 2.1-11

## image plot can be used to identify features in a data set
## (e.g., recording problems with some transactions containing all items).

#== Discussion of the heatmap
# Some users saw more movies than the others. 
# However, this chart is just displaying some random users and items. 
# What if, instead, we select the most relevant users and items? 
# This means visualizing only the users who have seen many movies and the
# movies that have been seen by many users. 
# To identify and select the most relevant users and movies, we may:
# (1) Determine the minimum number of movies per user.
# (2) Determine the minimum number of users per movie.
# (3) Select the users and movies matching these criteria.

# For instance, we can visualize the top percentile of users and movies. 
# In order to do this, we use the quantile function:


# Heat map restricted by fixed thresholds: users with at least 400 ratings
# and movies with at least 300 ratings.
dev.new()
image(
  MovieLense[rowCounts(MovieLense) >= 400, colCounts(MovieLense) >= 300],
  main = "Heatmap of the top users and movies"
) # Fig. 2.1-12

# Thresholds taken from the data instead: the 99th percentile of the number
# of ratings per user and per movie.
min_movies <- quantile(rowCounts(MovieLense), 0.99)
min_users <- quantile(colCounts(MovieLense), 0.99)
min_movies
min_users

# Now, we can visualize the rows and columns above these thresholds.

dev.new()
image(
  MovieLense[
    rowCounts(MovieLense) > min_movies,
    colCounts(MovieLense) > min_users
  ],
  main = "Heatmap of the top 1% users and movies"
) # Fig. 2.1-13

# Let's take account of the users having watched more movies. 
# Most of them have seen all the top movies, and this is not surprising. 
# We can notice some columns that are darker than the others. 
# These columns represent the highest-rated movies.
# Conversely, darker rows represent users giving higher ratings. 
# Because of this, we might need to normalize the data.

# So far, we have explored the data. 
# Next, we will process and transform the inputs for the recommendation models.







# More visualization

####################################
##==   Data preparation for RS  ==##
####################################
# The code chunk here will show you the steps to prepare the data for the recommender:
 # Step 1. Select the relevant data.
 # Step 2. Normalize the data.
 # Step 3. Binarize the data.

##== Step 1. Selecting the most relevant data

# When we explored the data, we noticed that the table contains:
# (1) Movies that have been viewed only a few times: Their ratings might be biased.
# (2) Users who rated only a few movies: Their ratings might be biased.
# We need to determine the minimum number of users per movie and vice versa. 

# The correct solution comes from iterating the entire process of preparing the data,
# building a recommendation model, and validating it. 

# Since we are implementing the model for the first time, we can use a rule of thumb:
# After having built the models, we can come back and modify the data preparation.
# We will define ratings_movies containing the matrix that we will use. 
# It takes account of:
# (1) Users who have rated more than 50 movies
# (2) Movies that have been watched more than 100 times

# The preceding points define the following code:
# Keep users with more than 50 ratings and movies with more than 100 ratings,
# then print a summary of the resulting matrix.
ratings_movies <- MovieLense[
  rowCounts(MovieLense) > 50,
  colCounts(MovieLense) > 100
]
ratings_movies

# The ratings_movies object contains about half of the users and
# a fifth of the movies in comparison with MovieLense.

# To explore the most relevant data, we visualize the top of the matrix.
# As we did previously, we visualize the top 2% of users and movies in ratings_movies:

# 98th-percentile thresholds for ratings per user and ratings per movie.
min_movies <- quantile(rowCounts(ratings_movies), 0.98)
min_users <- quantile(colCounts(ratings_movies), 0.98)

# Heat map of the users and movies above those thresholds.
dev.new()
image(
  ratings_movies[
    rowCounts(ratings_movies) > min_movies,
    colCounts(ratings_movies) > min_users
  ],
  main = "Heatmap of the top users and movies"
)

# As we already noticed, some rows are darker than the others. 
# This might mean that some users give higher ratings to all the movies.
# However, we have visualized the top movies only. 
# To overview all users, we check the distribution of the user-specific average rating:
# Average rating given by each user in ratings_movies.
avg.ratings_per_user <- rowMeans(ratings_movies)

# Histogram of the per-user averages, using bins of width 0.1.
# Built with ggplot() + stat_bin() like the other charts in this script:
# qplot() already adds its own default histogram layer, so adding stat_bin()
# on top of it drew two overlapping histograms with different bin widths
# (and qplot() is deprecated in recent ggplot2 releases).
dev.new()
ggplot(
  data.frame(avg.ratings_per_user = avg.ratings_per_user),
  aes(x = avg.ratings_per_user)
) +
  stat_bin(binwidth = 0.1) +
  ggtitle("Distribution of the average rating per user") +
  xlab("Average ratings per user")

# As expected, there is huge difference among users. Then we transform the data.
# Having users who give high (or low) ratings to all their movies might
# bias the results.
# We can remove this effect by normalizing the data in such a way that the average
# rating of each user is 0, which can be done by the built-in normalize function:

##== Step 2.  Normalization ##

# Normalize the ratings with recommenderlab's normalize() defaults.
ratings_movies_norm <- normalize(ratings_movies)

# Count the users whose mean normalized rating differs from 0 by more than a
# small tolerance; the expected count is 0.
# abs() is needed so that means below -0.00001 are counted as well; a plain
# "> 0.00001" test would only detect positive deviations.
sum(abs(rowMeans(ratings_movies_norm)) > 0.00001) # Average rating by users

# As expected, the mean rating of each user is 0 (apart from the approximation error).
# We can visualize the new matrix using image. 
# Let's re-build the heat map to visualize the normalised matrix
# 98th-percentile thresholds, computed on the un-normalized matrix.
min_movies <- quantile(rowCounts(ratings_movies), 0.98)
min_users <- quantile(colCounts(ratings_movies), 0.98)

# Heat map of the normalized ratings for the users and movies above them.
dev.new()
image(
  ratings_movies_norm[
    rowCounts(ratings_movies_norm) > min_movies,
    colCounts(ratings_movies_norm) > min_users
  ],
  main = "Heatmap of the top 2% users and movies"
)

# The first difference that we can notice is the colors, and this is because the data
# is continuous. Original rating was an integer between 1 and 5. 
# After the normalization, the rating can be any number between -4 and 4.
# There are still some lines that are more blue and some that are more red.
# The reason is that we are visualizing only the top movies. 
# We already checked that the average rating is 0 for each user.

# Same heat map with the thresholds lowered to the 97th percentile.
min_movies <- quantile(rowCounts(ratings_movies), 0.97)
min_users <- quantile(colCounts(ratings_movies), 0.97)
dev.new()
image(
  ratings_movies_norm[
    rowCounts(ratings_movies_norm) > min_movies,
    colCounts(ratings_movies_norm) > min_users
  ],
  main = "Heatmap of the top 3% users and movies"
)


##== Step 3.  Binarization ##

# Some recommendation models work on binary data, so we might want to binarize
# our data, that is, define a table containing only 0s and 1s. 
# The 0s will be either treated as missing values or as bad ratings.
# In our case, we can either:
# (1) Define a matrix having 1 if the user rated the movie, and 0 otherwise. In this
# case, we are losing the information about the rating.
# (2) Define a matrix having 1 if the rating is above or equal to a definite threshold
# (for example, 3), and 0 otherwise. 
# In this case, giving a bad rating to a movie is equivalent to not having rated it.
# Depending on the context, one choice is more appropriate than the other.
# The function to binarize the data is binarize. 
# Let's apply it to our data. 
# First, let's define a matrix equal to 1 if the movie has been watched, 
# that is if its rating is at least 1:
  
# Binary matrix: 1 where the rating is at least 1, i.e. the movie was rated.
ratings_movies_watched <- binarize(ratings_movies, minRating = 1)

# The charts are black and white here, so a larger share of users and movies
# can be shown, for example 5 percent. The thresholds are the 95th
# percentiles of the counts taken on ratings_movies, and the same counts are
# used below to select rows and columns of the binary matrix.
min_movies_binary <- quantile(rowCounts(ratings_movies), 0.95)
min_users_binary <- quantile(colCounts(ratings_movies), 0.95)

# Heat map of the top users and movies:
dev.new()
image(
  ratings_movies_watched[
    rowCounts(ratings_movies) > min_movies_binary,
    colCounts(ratings_movies) > min_users_binary
  ],
  main = "Heatmap of the top 5% users and movies"
)

# Only a few cells contain un-watched movies. This is just because we selected the top
# users and movies.
# Let's use the same approach to compute and visualize the other binary matrix 
# The cells having a rating above the threshold will have their value equal to 1 and the
# other cells will be 0s:
# Binary matrix built with a minimum rating of 3.
ratings_movies_good <- binarize(ratings_movies, minRating = 3)

# Heat map of the top users and movies, selected with the same
# min_movies_binary / min_users_binary thresholds as before:
dev.new()
image(
  ratings_movies_good[
    rowCounts(ratings_movies) > min_movies_binary,
    colCounts(ratings_movies) > min_users_binary
  ],
  main = "Heatmap of the top users and movies"
)

# As expected, we have more white cells now. 
# Depending on the model, we can use the raw ratings matrix, or normalize/binarize it.

# Exercise 3. Re-work this code with dataset created by Exercise 1.

# In this code, we prepared the data to perform recommendations. 
# In the upcoming code, we will build collaborative filtering models.
